package com.dullwolf.utils;


import com.jfinal.kit.HttpKit;
import com.jfinal.kit.StrKit;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.URL;
import java.net.URLConnection;
import java.nio.charset.Charset;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Static helpers for fetching a page over HTTP and picking pieces out of its text:
 * lines containing given keywords, the contents of {@code <font>} elements and the
 * {@code href} values of {@code <a>} tags.
 *
 * <p>Errors are reported with {@code printStackTrace()}; the methods then return
 * {@code null} (readers) or whatever text was collected so far (strings).
 */
public class SpiderHtml {

    /** User-Agent header sent with every connection opened by {@code getBr}. */
    private static final String USER_AGENT = "Mozilla/4.0 (compatible; MSIE 5.0; Windows NT; DigExt)";

    /** Charset name used when the caller does not supply one. */
    private static final String DEFAULT_CHARSET = "UTF-8";

    /**
     * Matches one tag-like span: a {@code <}, any run of characters other than {@code <},
     * then a {@code >}. Inside the character class the {@code .} is a literal dot that is
     * unioned with {@code [^<]}, so it does not widen the match.
     */
    private static final Pattern HTML_TAG = Pattern.compile("<[.[^<]]*>");

    /** Captures the content of a {@code <font ...>...</font>} element in group 1. */
    private static final Pattern FONT_TAG =
            Pattern.compile("<font.*?>([\\s\\S]*?)</font>", Pattern.CASE_INSENSITIVE);

    /** Captures the double-quoted {@code href} value of an {@code <a href="...">} tag in group 1. */
    private static final Pattern ANCHOR_HREF =
            Pattern.compile("<a[\\s]+href[\\s]*=[\\s]*\"([^<\"]+)\"", Pattern.CASE_INSENSITIVE);

    /**
     * Opens {@code srcUrl} and returns a reader that decodes the response as UTF-8.
     *
     * @param srcUrl the URL to open
     * @return a reader over the response body, or {@code null} if the connection could not be opened
     */
    public static BufferedReader getBr(String srcUrl) {
        return getBr(srcUrl, DEFAULT_CHARSET);
    }

    /**
     * Opens {@code srcUrl} and returns a reader that decodes the response with {@code charSet}.
     *
     * @param srcUrl  the URL to open
     * @param charSet name of the charset used to decode the response; {@code null} means UTF-8
     * @return a reader over the response body, or {@code null} if the charset name is not
     *         usable or the connection could not be opened (the exception is printed)
     */
    public static BufferedReader getBr(String srcUrl, String charSet) {
        try {
            // Resolve the charset before opening the connection, so that a bad charset
            // name cannot leave an already-opened response stream unclosed.
            Charset charset = Charset.forName(charSet == null ? DEFAULT_CHARSET : charSet);
            URLConnection urlConn = new URL(srcUrl).openConnection();
            urlConn.setRequestProperty("User-Agent", USER_AGENT);
            return new BufferedReader(new InputStreamReader(urlConn.getInputStream(), charset));
        } catch (Exception e) {
            e.printStackTrace();
            return null;
        }
    }

    /**
     * Fetches {@code srcUrl} with {@code HttpKit.get} and collects the lines that contain
     * any of {@code keys}, leaving HTML markup in place.
     *
     * <p>Each matching line is trimmed and terminated with {@code "\n"}. A line that
     * contains several keys is appended once per matching key.
     *
     * @param srcUrl the URL to fetch
     * @param keys   substrings to look for in each line
     * @return the collected lines, or an empty string if nothing matched
     */
    public static String spiderByKeyWithHtml(String srcUrl, Set<String> keys) {
        StringBuilder sb = new StringBuilder();
        String result = HttpKit.get(srcUrl);
        String[] lines = result.split("\\n");
        try {
            for (String line : lines) {
                if (StrKit.isBlank(line)) {
                    continue;
                }
                for (String key : keys) {
                    if (line.contains(key)) {
                        sb.append(line.trim()).append("\n");
                    }
                }
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
        return sb.toString();
    }

    /**
     * Reads {@code srcUrl} line by line and collects the lines that contain any of
     * {@code keys}, with HTML tags stripped by {@link #guoHtml(String)}.
     *
     * <p>Original author's note: this method is of little general use; it is used to
     * crawl pages for the Cantonese translation feature.
     *
     * <p>Each matching line is trimmed, stripped of tags and terminated with {@code "\n"}.
     * A line that contains several keys is appended once per matching key.
     *
     * @param srcUrl  the URL to read
     * @param keys    substrings to look for in each line
     * @param charSet name of the charset used to decode the response; {@code null} means UTF-8
     * @return the collected text; empty if nothing matched or the page could not be opened
     */
    public static String spiderByKey(String srcUrl, Set<String> keys, String charSet) {
        StringBuilder sb = new StringBuilder();
        BufferedReader br = getBr(srcUrl, charSet);
        if (br == null) {
            // getBr has already printed the failure; there is nothing to read.
            return sb.toString();
        }
        try {
            String line;
            while ((line = br.readLine()) != null) {
                if (StrKit.isBlank(line)) {
                    continue;
                }
                for (String key : keys) {
                    if (line.contains(key)) {
                        sb.append(guoHtml(line.trim())).append("\n");
                    }
                }
            }
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            try {
                br.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
        return sb.toString();
    }

    /**
     * Filters out HTML tags.
     *
     * @param s the text to clean
     * @return {@code s} with every tag-like span removed and the result trimmed
     */
    public static String guoHtml(String s) {
        return HTML_TAG.matcher(s).replaceAll("").trim();
    }

    /**
     * Collects the contents of {@code <font>} elements whose content starts with {@code key}.
     *
     * @param html the markup to scan
     * @param key  required prefix of the element content
     * @return the matching contents, each followed by {@code "\n"}; empty if none match
     */
    public static String parseFont(String html, String key) {
        return collectMatchesStartingWith(FONT_TAG, html, key);
    }

    /**
     * Collects the {@code href} values of {@code <a>} tags whose value starts with {@code key}.
     *
     * @param html the markup to scan
     * @param key  required prefix of the href value
     * @return the matching href values, each followed by {@code "\n"}; empty if none match
     */
    public static String parseHref(String html, String key) {
        return collectMatchesStartingWith(ANCHOR_HREF, html, key);
    }

    /** Joins, one per line, every group-1 capture of {@code pattern} in {@code html} that starts with {@code key}. */
    private static String collectMatchesStartingWith(Pattern pattern, String html, String key) {
        Matcher m = pattern.matcher(html);
        StringBuilder ret = new StringBuilder();
        while (m.find()) {
            String captured = m.group(1);
            if (captured.startsWith(key)) {
                ret.append(captured).append("\n");
            }
        }
        return ret.toString();
    }

}
